Importing Libraries¶
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn. metrics import mean_absolute_error, mean_squared_error, r2_score
loading the dataset¶
In [2]:
df = pd.read_csv('train.csv')
In [3]:
df.head()
Out[3]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
Data Understanding¶
In [4]:
df.shape
Out[4]:
(550068, 12)
In [5]:
df.describe()
Out[5]:
| User_ID | Occupation | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|
| count | 5.500680e+05 | 550068.000000 | 550068.000000 | 550068.000000 | 376430.000000 | 166821.000000 | 550068.000000 |
| mean | 1.003029e+06 | 8.076707 | 0.409653 | 5.404270 | 9.842329 | 12.668243 | 9263.968713 |
| std | 1.727592e+03 | 6.522660 | 0.491770 | 3.936211 | 5.086590 | 4.125338 | 5023.065394 |
| min | 1.000001e+06 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 3.000000 | 12.000000 |
| 25% | 1.001516e+06 | 2.000000 | 0.000000 | 1.000000 | 5.000000 | 9.000000 | 5823.000000 |
| 50% | 1.003077e+06 | 7.000000 | 0.000000 | 5.000000 | 9.000000 | 14.000000 | 8047.000000 |
| 75% | 1.004478e+06 | 14.000000 | 1.000000 | 8.000000 | 15.000000 | 16.000000 | 12054.000000 |
| max | 1.006040e+06 | 20.000000 | 1.000000 | 20.000000 | 18.000000 | 18.000000 | 23961.000000 |
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User_ID 550068 non-null int64 1 Product_ID 550068 non-null object 2 Gender 550068 non-null object 3 Age 550068 non-null object 4 Occupation 550068 non-null int64 5 City_Category 550068 non-null object 6 Stay_In_Current_City_Years 550068 non-null object 7 Marital_Status 550068 non-null int64 8 Product_Category_1 550068 non-null int64 9 Product_Category_2 376430 non-null float64 10 Product_Category_3 166821 non-null float64 11 Purchase 550068 non-null int64 dtypes: float64(2), int64(5), object(5) memory usage: 50.4+ MB
In [7]:
df.isna().sum()
# there is 173638 missing values in Product_Category_2 and 383247 missing values in Product_Category_3
Out[7]:
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 173638 Product_Category_3 383247 Purchase 0 dtype: int64
Data Visualization¶
In [8]:
df.head()
Out[8]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
In [9]:
plt.figure(figsize = (8, 6))
sns.countplot(data = df, x = 'Gender')
Out[9]:
<Axes: xlabel='Gender', ylabel='count'>
count of Male gender is higher as compated to the Female
In [10]:
plt.figure(figsize=(8,6))
# creating barplot of Gender and purchase
sns.barplot(data = df, x = "Gender", y = "Purchase")
Out[10]:
<Axes: xlabel='Gender', ylabel='Purchase'>
Higher purchase have been done by male gender as compared to the female
In [11]:
sns.barplot(y = 'Purchase', x= 'Occupation', data = df)
Out[11]:
<Axes: xlabel='Occupation', ylabel='Purchase'>
Occupation 7, 12, 14, 17 has higher purchase
In [12]:
plt.figure(figsize = (10,6))
sns.barplot(x = "Occupation", y ="Purchase" , hue = "Gender", data = df)
Out[12]:
<Axes: xlabel='Occupation', ylabel='Purchase'>
Outliers Detection¶
In [13]:
sns.boxplot(data = df , x = "Gender", y = "Purchase")
Out[13]:
<Axes: xlabel='Gender', ylabel='Purchase'>
In [14]:
sns.catplot(data = df.sort_values('Purchase', ascending=False), kind = 'boxen', height = 7, aspect = 3 , x = "Gender", y = "Purchase")
D:\Projects\analysis\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Out[14]:
<seaborn.axisgrid.FacetGrid at 0x2250c03c5d0>
In [15]:
sns.boxplot(data =df, x = "Occupation", y = "Purchase")
Out[15]:
<Axes: xlabel='Occupation', ylabel='Purchase'>
This column has outliers which may effect the performance of the machine learning models
In [16]:
sns.boxplot(data = df,x = "Age", y = "Purchase")
Out[16]:
<Axes: xlabel='Age', ylabel='Purchase'>
This column also have some outliers
In [17]:
sns.boxplot(data =df , x = 'Product_Category_1', y = "Purchase")
Out[17]:
<Axes: xlabel='Product_Category_1', ylabel='Purchase'>
This column has outliers
Data Preprocessing¶
In [18]:
df.head()
Out[18]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
In [19]:
df['Product_ID'] = df['Product_ID'].str.replace('P00', '') # replacing P00 with blank space
In [20]:
ss = StandardScaler()
In [21]:
df['Product_ID'] = ss.fit_transform(df['Product_ID'].values.reshape(-1,1))
In [22]:
df.drop(['Product_Category_3'], axis =1, inplace = True) # max no of missing values so we drop this col
In [23]:
df['Product_Category_2']= df['Product_Category_2'].fillna(df['Product_Category_2'].mean()) # filling nan values with the mean
In [24]:
df.isna().sum()
Out[24]:
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 0 Purchase 0 dtype: int64
In [25]:
df.head()
Out[25]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | -1.028774 | F | 0-17 | 10 | A | 2 | 0 | 3 | 9.842329 | 8370 |
| 1 | 1000001 | 0.722139 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.000000 | 15200 |
| 2 | 1000001 | -0.845799 | F | 0-17 | 10 | A | 2 | 0 | 12 | 9.842329 | 1422 |
| 3 | 1000001 | -0.869157 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.000000 | 1057 |
| 4 | 1000002 | 1.077382 | M | 55+ | 16 | C | 4+ | 0 | 8 | 9.842329 | 7969 |
Label Encoding¶
In [26]:
categorical_columns = ['Gender', 'City_Category', 'Age']
le = LabelEncoder()
for i in categorical_columns:
df[i] = le.fit_transform(df[i])
df.dtypes
Out[26]:
User_ID int64 Product_ID float64 Gender int32 Age int32 Occupation int64 City_Category int32 Stay_In_Current_City_Years object Marital_Status int64 Product_Category_1 int64 Product_Category_2 float64 Purchase int64 dtype: object
In [27]:
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].replace('4+', '4')
In [28]:
# changing the datatypes to integer
df['Gender'] = df['Gender'].astype(int)
df['Age'] = df['Age'].astype(int)
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(int)
df['City_Category'] = df['City_Category'].astype('category')
In [29]:
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[29], line 1 ----> 1 d NameError: name 'd' is not defined
In [30]:
df.head()
Out[30]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | -1.028774 | 0 | 0 | 10 | 0 | 2 | 0 | 3 | 9.842329 | 8370 |
| 1 | 1000001 | 0.722139 | 0 | 0 | 10 | 0 | 2 | 0 | 1 | 6.000000 | 15200 |
| 2 | 1000001 | -0.845799 | 0 | 0 | 10 | 0 | 2 | 0 | 12 | 9.842329 | 1422 |
| 3 | 1000001 | -0.869157 | 0 | 0 | 10 | 0 | 2 | 0 | 12 | 14.000000 | 1057 |
| 4 | 1000002 | 1.077382 | 1 | 6 | 16 | 2 | 4 | 0 | 8 | 9.842329 | 7969 |
In [31]:
### Distribution Plots
In [32]:
rows = 3
cols = 3
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(10, 4))
col = df.columns
index = 2
for i in range(rows):
for j in range(cols):
if index < len(col):
sns.histplot(df[col[index]], ax=ax[i][j], kde=True) # You can use sns.kdeplot() for KDE plots
ax[i][j].set_title(col[index])
index += 1
else:
ax[i][j].axis("off") # Turn off empty subplots if there are fewer columns than rows*cols
plt.tight_layout()
plt.show()
In [33]:
df['Purchase'] =np.log(df['Purchase']) # transfer data into normal distribution
In [34]:
df = pd.get_dummies(df)
# used to convert categorical variable into dummy/indicator variables
df.head()
Out[34]:
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Purchase | City_Category_0 | City_Category_1 | City_Category_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | -1.028774 | 0 | 0 | 10 | 2 | 0 | 3 | 9.842329 | 9.032409 | True | False | False |
| 1 | 1000001 | 0.722139 | 0 | 0 | 10 | 2 | 0 | 1 | 6.000000 | 9.629051 | True | False | False |
| 2 | 1000001 | -0.845799 | 0 | 0 | 10 | 2 | 0 | 12 | 9.842329 | 7.259820 | True | False | False |
| 3 | 1000001 | -0.869157 | 0 | 0 | 10 | 2 | 0 | 12 | 14.000000 | 6.963190 | True | False | False |
| 4 | 1000002 | 1.077382 | 1 | 6 | 16 | 4 | 0 | 8 | 9.842329 | 8.983314 | False | False | True |
Machine Learning Part¶
In [35]:
X = df.drop(labels = ['Purchase'], axis =1)
y = df['Purchase']
In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# the data is split into 80 percent train size and 20 percent test size
In [37]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
Linear Regression¶
In [38]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[38]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [39]:
y_predict = lr.predict(X_test)
## predicting on X_test
print('r2_score:', r2_score(y_test, y_predict))
print('mean_absolute_error:', mean_absolute_error(y_test, y_predict))
print('mean_squared_error:', mean_squared_error(y_test, y_predict))
print('root_mean_squared_error',np.sqrt(mean_squared_error(y_test, y_predict)))
r2_score: 0.20164239829578356 mean_absolute_error: 0.45565817118315044 mean_squared_error: 0.44379631133591096 root_mean_squared_error 0.6661803894861443
In [40]:
# r2 score is 0.20 and other are higher and our model is not performing well this mean model is not very accurate to predict the purchase or the target columns
Decision Tree Regression¶
In [41]:
dtr = DecisionTreeRegressor(max_depth=9)
dtr.fit(X_train, y_train)
Out[41]:
DecisionTreeRegressor(max_depth=9)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(max_depth=9)
In [42]:
#predicting trrain
train_predict = dtr.predict(X_train)
#predicting test
test_predict=dtr.predict(X_test)
In [43]:
rmse_train = np.sqrt(metrics.mean_squared_error(y_train, train_predict))
rmse_test = np.sqrt(metrics.mean_squared_error(y_test, test_predict))
print('RMSE score for Training Data:', str(rmse_train))
print('RMSE score for Test Data:', str(rmse_test))
print("*"*20)
print('r2 score for train:', dtr.score(X_train, y_train))
print('r2 score for test:', dtr.score(X_test, y_test))
RMSE score for Training Data: 0.3680408214406253 RMSE score for Test Data: 0.3689567100682491 ******************** r2 score for train: 0.7519510621944241 r2 score for test: 0.7551136360952996
Random Forest Regression¶
In [44]:
Rf = RandomForestRegressor()
In [45]:
Rf.fit(X_train, y_train)
Out[45]:
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
In [46]:
# predicting train data
rf_train_predict = Rf.predict(X_train)
# predicting test data
rf_test_predict = Rf.predict(X_test)
In [47]:
print(rf_train_predict)
[8.94589438 9.17581003 8.13538297 ... 8.84825964 9.3764006 9.16917216]
In [48]:
print(rf_test_predict)
[9.2016627 6.95881267 9.70558683 ... 9.70330271 9.79481092 9.5949654 ]
In [49]:
rmse_training = (np.sqrt(metrics.mean_squared_error(y_train, rf_train_predict)))
rmse_test = (np.sqrt(metrics.mean_squared_error(y_test, rf_test_predict)))
print('RMSE for training data:', rmse_training)
print('RMSE for test data:', rmse_test)
print('*'*50)
print('Rsquared value on training data:', Rf.score(X_train, y_train))
print('Rsquared value on test data:', Rf.score(X_test, y_test))
RMSE for training data: 0.13153861584112314 RMSE for test data: 0.34961990142644944 ************************************************** Rsquared value on training data: 0.9683151304061405 Rsquared value on test data: 0.7801097015706281
In [50]:
# Random Foresr regressor model is better than linear regression and decision tree regression,
# as we have a low root mean square value and the higher rsquared value
In [51]:
df_test = pd.read_csv('test.csv')
df_test.head()
Out[51]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | P00128942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 11.0 | NaN |
| 1 | 1000009 | P00113442 | M | 26-35 | 17 | C | 0 | 0 | 3 | 5.0 | NaN |
| 2 | 1000010 | P00288442 | F | 36-45 | 1 | B | 4+ | 1 | 5 | 14.0 | NaN |
| 3 | 1000010 | P00145342 | F | 36-45 | 1 | B | 4+ | 1 | 4 | 9.0 | NaN |
| 4 | 1000011 | P00053842 | F | 26-35 | 1 | C | 1 | 0 | 4 | 5.0 | 12.0 |
In [52]:
df_test.isna().sum()
Out[52]:
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 72344 Product_Category_3 162562 dtype: int64
In [53]:
df_test['Product_ID'] = df_test['Product_ID'].str.replace('P00', '')
ss = StandardScaler()
df_test['Product_ID'] = ss.fit_transform(df_test['Product_ID'].values.reshape(-1,1))
In [54]:
df_test.drop(['Product_Category_3'], axis = 1, inplace = True)
In [55]:
df_test['Product_Category_2'] = df_test['Product_Category_2'].fillna(df_test['Product_Category_2'].mean())
In [56]:
df_test['Stay_In_Current_City_Years'] = df_test['Stay_In_Current_City_Years'].replace('4+', '4')
In [57]:
df_test.isna().sum()
Out[57]:
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 0 dtype: int64
In [58]:
df_test.head()
Out[58]:
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | -0.434752 | M | 46-50 | 7 | B | 2 | 1 | 1 | 11.0 |
| 1 | 1000009 | -0.587188 | M | 26-35 | 17 | C | 0 | 0 | 3 | 5.0 |
| 2 | 1000010 | 1.133865 | F | 36-45 | 1 | B | 4 | 1 | 5 | 14.0 |
| 3 | 1000010 | -0.273465 | F | 36-45 | 1 | B | 4 | 1 | 4 | 9.0 |
| 4 | 1000011 | -1.173330 | F | 26-35 | 1 | C | 1 | 0 | 4 | 5.0 |
In [59]:
cat_cols = ['Gender', 'Age', 'City_Category']
le = LabelEncoder()
for i in cat_cols:
df_test[i] = le.fit_transform(df_test[i])
In [60]:
df_test.dtypes
Out[60]:
User_ID int64 Product_ID float64 Gender int32 Age int32 Occupation int64 City_Category int32 Stay_In_Current_City_Years object Marital_Status int64 Product_Category_1 int64 Product_Category_2 float64 dtype: object
In [61]:
df_test['Gender'] = df_test['Gender'].astype(int)
df_test['Age'] = df_test['Age'].astype(int)
df_test['Stay_In_Current_City_Years'] = df_test['Stay_In_Current_City_Years'].astype(int)
df_test['City_Category'] = df_test['City_Category'].astype('category')
In [62]:
df_test = pd.get_dummies(df_test)
In [63]:
df_test.head()
Out[63]:
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | City_Category_0 | City_Category_1 | City_Category_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | -0.434752 | 1 | 4 | 7 | 2 | 1 | 1 | 11.0 | False | True | False |
| 1 | 1000009 | -0.587188 | 1 | 2 | 17 | 0 | 0 | 3 | 5.0 | False | False | True |
| 2 | 1000010 | 1.133865 | 0 | 3 | 1 | 4 | 1 | 5 | 14.0 | False | True | False |
| 3 | 1000010 | -0.273465 | 0 | 3 | 1 | 4 | 1 | 4 | 9.0 | False | True | False |
| 4 | 1000011 | -1.173330 | 0 | 2 | 1 | 1 | 0 | 4 | 5.0 | False | False | True |
In [64]:
df.shape
Out[64]:
(550068, 13)
In [65]:
df_test.shape #we have to predict the column purchase so there is difference in columns
Out[65]:
(233599, 12)
In [66]:
df
Out[66]:
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Purchase | City_Category_0 | City_Category_1 | City_Category_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | -1.028774 | 0 | 0 | 10 | 2 | 0 | 3 | 9.842329 | 9.032409 | True | False | False |
| 1 | 1000001 | 0.722139 | 0 | 0 | 10 | 2 | 0 | 1 | 6.000000 | 9.629051 | True | False | False |
| 2 | 1000001 | -0.845799 | 0 | 0 | 10 | 2 | 0 | 12 | 9.842329 | 7.259820 | True | False | False |
| 3 | 1000001 | -0.869157 | 0 | 0 | 10 | 2 | 0 | 12 | 14.000000 | 6.963190 | True | False | False |
| 4 | 1000002 | 1.077382 | 1 | 6 | 16 | 4 | 0 | 8 | 9.842329 | 8.983314 | False | False | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550063 | 1006033 | 1.924156 | 1 | 5 | 13 | 1 | 1 | 20 | 9.842329 | 5.908083 | False | True | False |
| 550064 | 1006035 | 1.953267 | 0 | 2 | 1 | 3 | 0 | 20 | 9.842329 | 5.916202 | False | False | True |
| 550065 | 1006036 | 1.953267 | 0 | 2 | 15 | 4 | 1 | 20 | 9.842329 | 4.919981 | False | True | False |
| 550066 | 1006038 | 1.953267 | 0 | 6 | 1 | 2 | 0 | 20 | 9.842329 | 5.899897 | False | False | True |
| 550067 | 1006039 | 1.916360 | 0 | 4 | 0 | 4 | 1 | 20 | 9.842329 | 6.194405 | False | True | False |
550068 rows × 13 columns
In [67]:
df_test
Out[67]:
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | City_Category_0 | City_Category_1 | City_Category_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | -0.434752 | 1 | 4 | 7 | 2 | 1 | 1 | 11.000000 | False | True | False |
| 1 | 1000009 | -0.587188 | 1 | 2 | 17 | 0 | 0 | 3 | 5.000000 | False | False | True |
| 2 | 1000010 | 1.133865 | 0 | 3 | 1 | 4 | 1 | 5 | 14.000000 | False | True | False |
| 3 | 1000010 | -0.273465 | 0 | 3 | 1 | 4 | 1 | 4 | 9.000000 | False | True | False |
| 4 | 1000011 | -1.173330 | 0 | 2 | 1 | 1 | 0 | 4 | 5.000000 | False | False | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 233594 | 1006036 | -0.533098 | 0 | 2 | 15 | 4 | 1 | 8 | 9.849586 | False | True | False |
| 233595 | 1006036 | 0.801456 | 0 | 2 | 15 | 4 | 1 | 5 | 8.000000 | False | True | False |
| 233596 | 1006036 | -1.389691 | 0 | 2 | 15 | 4 | 1 | 1 | 5.000000 | False | True | False |
| 233597 | 1006037 | -0.476058 | 0 | 4 | 1 | 4 | 0 | 10 | 16.000000 | False | False | True |
| 233598 | 1006039 | 1.411200 | 0 | 4 | 0 | 4 | 1 | 4 | 5.000000 | False | True | False |
233599 rows × 12 columns
In [68]:
test_pred = Rf.predict(df_test)
len(test_pred)
D:\Projects\analysis\Lib\site-packages\sklearn\base.py:457: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names warnings.warn(
Out[68]:
233599
Selecting Random Forest Regressor to predict on out test dataset¶
In [69]:
frame = pd.read_csv('test.csv')
frame_info = frame[['User_ID', 'Product_ID', 'Gender', 'Occupation']]
frame_info.head()
Out[69]:
| User_ID | Product_ID | Gender | Occupation | |
|---|---|---|---|---|
| 0 | 1000004 | P00128942 | M | 7 |
| 1 | 1000009 | P00113442 | M | 17 |
| 2 | 1000010 | P00288442 | F | 1 |
| 3 | 1000010 | P00145342 | F | 1 |
| 4 | 1000011 | P00053842 | F | 1 |
In [70]:
# creating dataframe of prediction
prediction = pd.DataFrame(test_pred, columns=['Purchase'])
prediction['User_ID'] = frame_info['User_ID']
prediction['Product_ID'] = frame_info['Product_ID']
prediction['Gender'] = frame_info['Gender']
prediction['Occupation'] = frame_info['Occupation']
prediction.head()
Out[70]:
| Purchase | User_ID | Product_ID | Gender | Occupation | |
|---|---|---|---|---|---|
| 0 | 9.553715 | 1000004 | P00128942 | M | 7 |
| 1 | 9.609040 | 1000009 | P00113442 | M | 17 |
| 2 | 4.515243 | 1000010 | P00288442 | F | 1 |
| 3 | 4.515243 | 1000010 | P00145342 | F | 1 |
| 4 | 4.686747 | 1000011 | P00053842 | F | 1 |
In [76]:
import plotly.express as px
px.histogram(prediction, x = 'Gender', y = 'Purchase')
In [73]:
# converting the prediction into csv
prediction.to_csv('BlackFridayPrediction.csv', index = False)
In [ ]: